This dataset is from Kaggle
Prepare for analysis
# Session setup: switch to the project's script folder, attach the plotting and
# data-wrangling packages quietly, and load the prepared crime data.
# NOTE(review): the working directory is a machine-specific path; the relative
# file name given to readRDS() below is resolved against it.
suppressWarnings(suppressMessages(
  setwd("~/../Desktop/Spring2019/MA681_RENEW_PROJECT/SCRIPT/")
))
suppressWarnings(suppressMessages(library(tidyverse)))
suppressWarnings(suppressMessages(library(corrplot)))

tidied_data <- readRDS(file = "crime_data.rds")
# Quick look at the first ten records.
head(tidied_data, n = 10)
Preprocessing for the analysis
# Add a human-readable district name to every record.
# `distric_name` is paired by position with the sorted district codes found in
# the data: the i-th sorted code gets the i-th name.
# NOTE(review): this pairing assumes the data holds exactly these 13 codes and
# that the code sorting last stands for "no report" -- confirm against the data.
distric_name <- c(
  "DOWNTOWN AND CHARLESTOWN-1", "DOWNTOWN AND CHARLESTOWN-2",
  "EAST BOSTON", "ROXBURY", "MATTAPAN", "DORCHESTER",
  "SOUTH BOSTON", "BRIGHTON", "SOUTH END", "JAMAICA PLAIN",
  "HYDE PARK", "WEST ROXBURY", "no report"
)
distric_code <- sort(unique(as.character(tidied_data$DISTRICT)))

# Vectorised lookup by column name. The previous row-wise apply() coerced every
# row of the data frame to character, addressed DISTRICT by position, and
# produced a list column whenever a code had no match (e.g. NA). match() yields
# NA for such rows and keeps the result a plain character vector.
tidied_data$district_name <- distric_name[
  match(as.character(tidied_data$DISTRICT), distric_code)
]
# Derive abbreviated month and weekday columns as factors with calendar-ordered
# levels. Weekdays are cut to three letters, except "Thursday", which keeps
# four so that it matches the "Thur" level.
tidied_data <- tidied_data %>%
  mutate(
    MONTH_abb = factor(substr(MONTH, 1, 3), levels = month.abb),
    WEEKDAY_abb = factor(
      substr(DAY_OF_WEEK, 1, ifelse(DAY_OF_WEEK == "Thursday", 4, 3)),
      levels = c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun")
    )
  )
# Build two factors whose level order follows the sorted data:
#   YEAR_MONTH = "<year>-<month abbreviation>"
#   MONTH_DAY  = "<month abbreviation>-<weekday abbreviation>"
# The rows are sorted first, so unique() returns the labels in sorted order and
# that order becomes the level order.
tidied_data <- arrange(tidied_data, YEAR, MONTH_abb)
factor_labels <- paste(tidied_data$YEAR, tidied_data$MONTH_abb, sep = "-")
factor_levels <- unique(factor_labels)
tidied_data[["YEAR_MONTH"]] <- factor(factor_labels, levels = factor_levels)

tidied_data <- arrange(tidied_data, MONTH_abb, WEEKDAY_abb)
factor_labels <- paste(tidied_data$MONTH_abb, tidied_data$WEEKDAY_abb, sep = "-")
factor_levels <- unique(factor_labels)
tidied_data[["MONTH_DAY"]] <- factor(factor_labels, levels = factor_levels)

# Preview a positional selection of columns that includes the ones added above.
# NOTE(review): the indices assume the column layout of crime_data.rds.
head(tidied_data[, c(2, 14, 10, 3, 11, 15, 16, 17, 18)], n = 10)
Total number of crimes
# Line chart of the number of recorded crimes per calendar date.
# The summary column was previously misspelled ("Occurrenes") and the typo
# showed up as the y-axis title; it is now "Occurrences".
tidied_data %>%
  group_by(crime_date) %>%
  summarize(Occurrences = n()) %>%
  ggplot(aes(x = as.Date(crime_date), y = Occurrences, group = 1)) +
  geom_line() +
  scale_x_date(
    # Fixed tick positions chosen by the author for this data set.
    breaks = as.Date(c(
      "2015-06-15", "2016-04-12", "2017-02-09", "2017-12-06", "2018-10-03"
    ))
  ) +
  xlab("Crime date")
Number of crimes in different districts in different months. The districts and the time range can be selected.
#' Line chart of monthly crime counts, one line per district
#'
#' The bounds should lie within "2015-Jul" .. "2018-Sep", because the data for
#' June 2015 and October 2018 are not complete.
#'
#' @param tidied_data Data frame with `YEAR_MONTH` and `district_name` columns.
#' @param lower_year_month Pattern (passed to `grep()`) identifying the first
#'   month to show, e.g. "2016-Jan". The first matching month is used.
#' @param upper_year_month Pattern identifying the last month to show.
#' @param district_list District names to include; defaults to the file-level
#'   `distric_name` vector.
#' @return A ggplot object.
PlotDistrictsCrimesMonthly <- function(tidied_data, lower_year_month, upper_year_month, district_list = distric_name){
  ordered_data <- tidied_data %>% arrange(YEAR_MONTH)
  all_months <- unique(ordered_data$YEAR_MONTH)

  # grep() returns integer(0) when nothing matches, which used to make `:` fail
  # with an obscure message; check explicitly and use only the first match.
  lower_idx <- grep(lower_year_month, all_months)
  upper_idx <- grep(upper_year_month, all_months)
  if (length(lower_idx) == 0 || length(upper_idx) == 0) {
    stop(
      "`lower_year_month` and `upper_year_month` must each match a value of ",
      "YEAR_MONTH (for example \"2016-Jan\").",
      call. = FALSE
    )
  }
  selected_months <- all_months[lower_idx[1]:upper_idx[1]]

  # Label the axis at (roughly) the five quantile positions of the range.
  # Breaks are passed as character so the scale never sees factor codes.
  break_idx <- unique(floor(unname(quantile(seq_along(selected_months)))))
  axis_breaks <- as.character(selected_months[break_idx])

  monthly_counts <- ordered_data %>%
    filter(YEAR_MONTH %in% selected_months) %>%
    filter(district_name %in% district_list) %>%
    group_by(YEAR_MONTH, district_name) %>%
    summarize(Occurrences = n()) %>%
    ungroup() %>%
    na.omit()

  ggplot(
    monthly_counts,
    aes(x = YEAR_MONTH, y = Occurrences, group = district_name, color = district_name)
  ) +
    geom_line() +
    theme(legend.text = element_text(size = 7)) +
    scale_x_discrete(breaks = axis_breaks, labels = axis_breaks) +
    xlab("Time: Year-Month") +
    # paste0() avoids the doubled spaces that paste() added around the bounds.
    ggtitle(paste0(
      "Occurrences of crimes in different months from ",
      lower_year_month, " to ", upper_year_month
    ))
}
Number of crimes in different districts on different weekdays. The occurrences for each month-weekday combination are summed over the years included in the plot.
#' Line chart of crime counts per month-weekday combination, one line per district
#'
#' Counts are totals over all records of the selected years that fall in the
#' same `MONTH_DAY` value.
#'
#' @param tidied_data Data frame with `MONTH_DAY`, `YEAR` and `district_name`
#'   columns.
#' @param Start_Month_Day Pattern (passed to `grep()`) identifying the first
#'   `MONTH_DAY` value to show, e.g. "Jan-Mon". The first match is used.
#' @param End_Month_Day Pattern identifying the last `MONTH_DAY` value to show.
#' @param District_List District names to include; defaults to the file-level
#'   `distric_name` vector.
#' @param Year_List Years whose records are counted. Defaults to 2016 and 2017,
#'   the values that were previously hard-coded (presumably because the other
#'   years are only partly covered -- confirm).
#' @return A ggplot object.
PlotDistrictsCrimesDaily <- function(tidied_data, Start_Month_Day, End_Month_Day, District_List = distric_name, Year_List = c(2016, 2017)){
  ordered_data <- tidied_data %>% arrange(MONTH_DAY)
  all_month_days <- unique(ordered_data$MONTH_DAY)

  # grep() returns integer(0) when nothing matches, which used to make `:` fail
  # with an obscure message; check explicitly and use only the first match.
  start_idx <- grep(Start_Month_Day, all_month_days)
  end_idx <- grep(End_Month_Day, all_month_days)
  if (length(start_idx) == 0 || length(end_idx) == 0) {
    stop(
      "`Start_Month_Day` and `End_Month_Day` must each match a value of ",
      "MONTH_DAY (for example \"Jan-Mon\").",
      call. = FALSE
    )
  }
  selected_days <- all_month_days[start_idx[1]:end_idx[1]]

  # One axis label every seven positions, i.e. one per week of the range.
  # Breaks are passed as character so the scale never sees factor codes.
  axis_breaks <- as.character(
    selected_days[seq(1, length(selected_days), by = 7)]
  )

  weekday_counts <- ordered_data %>%
    filter(YEAR %in% Year_List) %>%
    filter(MONTH_DAY %in% selected_days) %>%
    filter(district_name %in% District_List) %>%
    group_by(MONTH_DAY, district_name) %>%
    summarize(Occurrences = n()) %>%
    ungroup()

  ggplot(
    weekday_counts,
    aes(x = MONTH_DAY, y = Occurrences, group = district_name, color = district_name)
  ) +
    geom_line() +
    theme(axis.text.x = element_text(size = 7.5)) +
    scale_x_discrete(breaks = axis_breaks, labels = axis_breaks) +
    xlab("Time: Month-Day") +
    # paste0() avoids the doubled spaces that paste() added around the bounds.
    ggtitle(paste0(
      "Occurrences of crimes in different weekdays from ",
      Start_Month_Day, " to ", End_Month_Day
    ))
}
# Monthly trend for five districts over the two complete years 2016-2017.
PlotDistrictsCrimesMonthly(
  tidied_data,
  "2016-Jan",
  "2017-Dec",
  c("SOUTH BOSTON", "SOUTH END", "WEST ROXBURY", "ROXBURY", "EAST BOSTON")
)

# Monthly trend for four districts from "2015-Jul" to "2018-Sep".
PlotDistrictsCrimesMonthly(
  tidied_data,
  "2015-Jul",
  "2018-Sep",
  c("DOWNTOWN AND CHARLESTOWN-1", "DOWNTOWN AND CHARLESTOWN-2", "EAST BOSTON", "SOUTH END")
)

# Month-weekday pattern from January to April for the same four districts.
PlotDistrictsCrimesDaily(
  tidied_data,
  "Jan-Mon",
  "Apr-Sun",
  c("DOWNTOWN AND CHARLESTOWN-1", "DOWNTOWN AND CHARLESTOWN-2", "EAST BOSTON", "SOUTH END")
)
Boxplot of daily crime occurrences in each month
# Number of crimes per calendar date in 2016 and 2017, keeping the month so
# that the dates can be grouped into one box per month.
x <- tidied_data %>%
  filter(YEAR %in% c(2016, 2017)) %>%
  group_by(MONTH, crime_date) %>%
  summarise(Occurrences = n())

# Overall mean of the daily counts, drawn as a dashed reference line.
day_mean <- mean(x$Occurrences)

ggplot(x, aes(x = MONTH, y = Occurrences)) +
  geom_boxplot(aes(fill = MONTH)) +
  geom_hline(yintercept = day_mean, linetype = 2, colour = "black", size = 1.25) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "Month", title = "Occurrences of crimes in each month")
Top crime types on Huntington Avenue.
#' Bar chart of the most frequent crime types on one street
#'
#' @param tidied_data Data frame with `STREET` and `OFFENSE_CODE_GROUP` columns.
#' @param street_name Street to report on; compared with `STREET` using `==`.
#' @param top Number of leading crime types to keep (default 10). Types tied
#'   with the last kept count are shown as well. When the street has fewer than
#'   `top` crime types, all of them are shown.
#' @return A ggplot object.
PlotOneStreetCrimes <- function(tidied_data, street_name, top = 10){
  stopifnot(is.numeric(top), length(top) == 1, top >= 1)

  # Restrict to the requested street first so only its records are counted.
  street_counts <- tidied_data %>%
    filter(STREET == street_name) %>%
    group_by(OFFENSE_CODE_GROUP) %>%
    summarize(Occurrences = n()) %>%
    ungroup()

  if (nrow(street_counts) > 0) {
    # Cap the rank at the number of available types: indexing the sorted
    # counts beyond their length gives NA, which used to drop every row and
    # leave the plot empty for streets with fewer than `top` crime types.
    cutoff_rank <- min(top, nrow(street_counts))
    cutoff <- sort(street_counts$Occurrences, decreasing = TRUE)[cutoff_rank]
    street_counts <- street_counts[street_counts$Occurrences >= cutoff, ]
  }

  ggplot(
    street_counts,
    aes(OFFENSE_CODE_GROUP, Occurrences, fill = OFFENSE_CODE_GROUP)
  ) +
    geom_bar(stat = "identity") +
    # The fill legend already names the crime types, so hide the tick labels.
    theme(axis.text.x = element_blank()) +
    xlab("Crime types") +
    ggtitle(paste0("Top ", top, " crime types in ", street_name))
}
Comparison of crime types across streets.
#' Stacked horizontal bar chart comparing selected crime types across streets
#'
#' @param tidied_data Data frame with `STREET` and `OFFENSE_CODE_GROUP` columns.
#' @param streets_list Streets to compare (values of `STREET`).
#' @param crimes_list Crime types to show (values of `OFFENSE_CODE_GROUP`);
#'   records of any other type are left out.
#' @return A ggplot object.
PlotMultipleStreetsCrimes <- function(tidied_data, streets_list, crimes_list){
  # Keep only the requested streets and crime types before counting. This
  # replaces the earlier "relabel everything else as Others, then drop Others"
  # step; as.character() also guards against ifelse()-style integer codes when
  # OFFENSE_CODE_GROUP is stored as a factor.
  street_crime_counts <- tidied_data %>%
    filter(STREET %in% streets_list, OFFENSE_CODE_GROUP %in% crimes_list) %>%
    mutate(Crime_types = as.character(OFFENSE_CODE_GROUP)) %>%
    group_by(STREET, Crime_types) %>%
    summarize(Occurrences = n()) %>%
    ungroup()

  ggplot(street_crime_counts, aes(STREET, Occurrences, fill = Crime_types)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    theme(legend.position = "right") +
    # The x aesthetic is STREET (crime types are the fill), so label it as such.
    xlab("Street") +
    ggtitle("Occurrences of crimes in streets")
}
# Crime types and streets used for the street-level comparisons below.
Crimes_types <- c(
  "Larceny",
  "Larceny From Motor Vehicle",
  "Simple Assault",
  "Aggravated Assault"
)
Streets_names <- c(
  "HUNTINGTON AVE",
  "BOYLSTON ST",
  "COLUMBUS AVE",
  "MASSACHUSETTS AVE",
  "NEWBURY ST"
)

# Ten most frequent crime types on two individual streets.
PlotOneStreetCrimes(tidied_data, "HUNTINGTON AVE", 10)
PlotOneStreetCrimes(tidied_data, "COMMONWEALTH AVE", 10)

# Selected crime types compared across the listed streets.
PlotMultipleStreetsCrimes(tidied_data, Streets_names, Crimes_types)
Correlation between different types of crimes
# Correlation between the daily counts of the different crime types.
# Steps: count crimes per (date, type), spread the types into columns with one
# row per date, treat missing date/type combinations as zero, then plot the
# correlation matrix of the columns.
tidied_data %>%
  group_by(crime_date, OFFENSE_CODE_GROUP) %>%
  summarize(Counts = n()) %>%
  spread(key = OFFENSE_CODE_GROUP, value = Counts) %>%
  remove_rownames() %>%
  column_to_rownames(var = "crime_date") %>%
  as.matrix() %>%
  replace(is.na(.), 0) %>%
  cor() %>%
  corrplot(
    type = "upper",
    method = "ellipse",
    tl.col = "black",
    tl.srt = 45,
    number.cex = .35,
    tl.cex = .25,
    outline = FALSE
  )
Crime locations plotted by longitude and latitude.
# Keep only complete records whose coordinates are not the -1 placeholder.
temp <- tidied_data %>%
  na.omit() %>%
  filter(Lat != -1, Long != -1)

# Scatter of crime locations coloured by district. Points are nearly
# transparent, while the legend keys are forced to full opacity so the colours
# stay readable.
ggplot(temp, aes(x = Long, y = Lat, colour = district_name)) +
  geom_point(alpha = .1) +
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "black")
  )